Resources:
There are 2 ways to install sgpv package.
# CRAN
install.packages("sgpv")
# GitHub
install.packages("devtools")
devtools::install_github("weltybiostat/sgpv")
Functions Included:
sgpvalue()
sgpower()
plotsgpv()
plotman()
fdrisk()
3 inference outcomes:
2 underlying truths:
cidemo = function(rep=100,        # number of replicated confidence intervals
                  mu=0.8,         # true event probability (data are Bernoulli(mu))
                  n=200,          # sample size per replication
                  alpha=0.05,     # one minus the confidence level
                  ylb="Probability",    # y-axis label
                  ltype=1,        # line type for CIs that cover mu
                  lcol=4,         # colour for CIs that cover mu
                  ltype1=1,       # line type for CIs that miss mu
                  lwd1=2,         # line width for CIs that miss mu
                  lcol1=2,        # colour for CIs that miss mu
                  plotset=FALSE,  # if TRUE, derive y-limits from mu +/- 5 SE
                  hilim=1,        # upper y-limit (used when plotset is FALSE)
                  lolim=0.5,      # lower y-limit (used when plotset is FALSE)
                  bcol="lemonchiffon2",  # indifference-zone fill colour
                  xleg=45,        # legend x position
                  yleg=0.65,      # legend y position
                  idz.lo=0.75,    # indifference-zone lower bound
                  idz.hi=0.85,    # indifference-zone upper bound
                  zone=TRUE){     # draw and report the indifference zone?
  ## Simulate `rep` Bernoulli(mu) samples of size n, draw the Wald CI for each,
  ## and highlight intervals that exclude the true value mu.  Counts how many
  ## intervals miss mu and (when zone = TRUE) how many miss the indifference
  ## zone [idz.lo, idz.hi] entirely.
  cints = matrix(0, 3, rep)     # row 1: replication index, row 2: upper, row 3: lower
  cints[1, ] = seq_len(rep)
  sigma = sqrt(mu * (1 - mu))   # true Bernoulli SD, treated as known
  ## Standard normal quantile (loop-invariant; hoisted out of the loop)
  za = abs(qnorm(alpha / 2))
  for (i in seq_len(rep)) {
    dat = rbinom(n, 1, prob = mu)
    cints[2, i] = mean(dat) + za * sigma / sqrt(n)  # upper CI limit
    cints[3, i] = mean(dat) - za * sigma / sqrt(n)  # lower CI limit
  }
  if (isTRUE(plotset)) {
    ## Auto-scale the y-axis to mu +/- 5 standard errors.
    lolim = mu - 5 * sigma / sqrt(n)
    hilim = mu + 5 * sigma / sqrt(n)
  }
  ## sigma = sqrt(mu*(1-mu)) <= 0.5 for Bernoulli data, so the first branch is
  ## always taken here; the sigma == 1 branch applies only if the function is
  ## adapted to standardized (sigma = 1) data.
  if (sigma != 1) {
    plot(cints[1, ], cints[2, ], ylim = c(lolim, hilim), type = "n",
         main = paste("95% Confidence Intervals for the Probability\n Sample Size of ", n, " per Endpoint"),
         ylab = ylb, xlab = "Replication")
  }
  if (sigma == 1) {
    plot(cints[1, ], cints[2, ], ylim = c(lolim, hilim), type = "n",
         main = paste("95% Confidence Intervals on the Standardized Effect Size \n Sample Size of ", n, " per Endpoint"),
         ylab = "Standardized Effect Size (X/sigma units)", xlab = "Endpoint Identification Number")
  }
  if (zone == TRUE) {
    ## Shade the indifference zone across the full width of the plot.
    polygon(c(0, rep, rep, 0), c(idz.hi, idz.hi, idz.lo, idz.lo), density = 200, col = bcol)
  }
  abline(h = mu)
  points(cints[1, ], cints[2, ], cex = 0.6, pch = 16, col = lcol)
  points(cints[1, ], cints[3, ], cex = 0.6, pch = 16, col = lcol)
  segments(cints[1, ], cints[2, ], cints[1, ], cints[3, ], lty = ltype, col = lcol)
  ct = 0    # number of CIs that exclude the true value mu
  ct2 = 0   # number of CIs that miss the indifference zone entirely
  for (i in seq_len(rep)) {
    if (cints[2, i] < mu | cints[3, i] > mu) {
      ## Re-draw intervals that exclude mu in the highlight colour.
      segments(cints[1, i], cints[2, i],
               cints[1, i], cints[3, i],
               lty = ltype1, col = lcol1, lwd = lwd1)
      points(cints[1, i], cints[2, i], cex = 0.6, pch = 16, col = lcol1)
      points(cints[1, i], cints[3, i], cex = 0.6, pch = 16, col = lcol1)
      ct = ct + 1
    }
    if (cints[2, i] < idz.lo | cints[3, i] > idz.hi) {
      ct2 = ct2 + 1
    }
  }
  if (rep > 100) {
    ## For many replications, mark the zone boundaries with dashed lines.
    abline(h = idz.hi, col = bcol, lty = 2, lwd = 1.5)
    abline(h = idz.lo, col = bcol, lty = 2, lwd = 1.5)
  }
  ## Theoretical probability of missing the indifference zone:
  ## P(CI upper limit < idz.lo) + P(CI lower limit > idz.hi).
  ## NOTE(review): computed but never used or returned in the original; also the
  ## expression does not centre at mu -- confirm the intended formula before use.
  miss.idz = pnorm(-za + idz.lo * sqrt(n) / sigma) + pnorm(-za - idz.hi * sqrt(n) / sigma)
  if (zone == TRUE) {
    legend(xleg, yleg, bty = "n",
           c(paste("Missed True Value: ", ct, " / ", rep, sep = ""),
             paste("Missed Indifference Zone: ", ct2, " / ", rep, sep = "")))
  }
  if (zone != TRUE) {
    legend(xleg, yleg, bty = "n",
           c(paste("Missed True Value: ",
                   ct, " / ", rep, sep = "")))
  }
}
## Fix the RNG stream so the CI demonstrations are reproducible.
set.seed(999)
## CI demos without the indifference zone (zone = FALSE), small and large n.
## NOTE(review): each call is issued twice -- presumably to render duplicate
## side-by-side figures in the knitted document; confirm this is intentional.
cidemo(n=75,zone=FALSE)
cidemo(n=75,zone=FALSE)
cidemo(n=400,zone=FALSE)
cidemo(n=400,zone=FALSE)
## CI demos with the indifference zone shown (default zone = TRUE).
cidemo(n=75)
cidemo(n=75)
cidemo(n=400)
cidemo(n=400)
## Probability that the SGPV is consistent with the alternative (p_delta = 0):
## the (1 - alpha) CI for the effect lies entirely outside the null interval
## [theta0 - null.delta, theta0 + null.delta], on either side.
##
## n          sample size(s); vectorised
## null.delta half-width of the null interval (0 recovers the usual two-sided
##            rejection probability of the alpha-level test)
## theta0     centre of the null interval
## theta      true effect under which the probability is evaluated
## V          variance of a single observation
## alpha      CI level used by the SGPV
## Returns a numeric vector the same length as n.
prob.alt.sg = function(n, null.delta,
                       theta0 = 0,
                       theta = 0,
                       V = 1,
                       alpha = 0.05) {
  ## (Dead pre-allocation of `results` removed; it was immediately overwritten.)
  z <- qnorm(1 - alpha / 2)
  se <- sqrt(V)
  ## Lower tail (CI entirely below the null zone) + upper tail (entirely above).
  pnorm(sqrt(n) * (theta0 - null.delta) / se - sqrt(n) * theta / se - z) +
    pnorm(-1 * sqrt(n) * (theta0 + null.delta) / se + sqrt(n) * theta / se - z)
}
## Probability that the SGPV is consistent with the null (p_delta = 1): the
## (1 - alpha) CI falls entirely inside the null interval
## [theta0 - null.delta, theta0 + null.delta].  This is only possible when the
## null zone is wider than the CI half-width, i.e.
## null.delta > z_{1-alpha/2} * sqrt(V/n); otherwise the probability is 0.
##
## n          sample size(s); vectorised
## null.delta half-width of the null interval (scalar or vector)
## theta0     centre of the null interval
## theta      true effect under which the probability is evaluated
## V          variance of a single observation
## alpha      CI level used by the SGPV
## Returns a numeric vector.
## NOTE(review): theta0 is not divided by sqrt(V) below, unlike the null.delta
## and theta terms -- harmless for the default theta0 = 0, but confirm the
## scaling if a nonzero theta0 is ever used.
prob.null.sg = function(n,
null.delta,
theta0=0,
theta=0,
V=1,
alpha=0.05){
results = rep(NA, length(n))
if(length(null.delta)>1){
## Vectorised case: split indices by whether the null zone exceeds the CI
## half-width at each n; the probability is exactly 0 where it does not.
con.large = which(null.delta>(qnorm(1-alpha/2)*sqrt(V)/sqrt(n)))
con.small = which(null.delta<=(qnorm(1-alpha/2)*sqrt(V)/sqrt(n)))
results[con.small] = rep(0, length(con.small))
results[con.large] =
pnorm(sqrt(n[con.large])*theta0+sqrt(n[con.large])*null.delta[con.large]/sqrt(V)-sqrt(n[con.large])*theta/sqrt(V)-qnorm(1-alpha/2))-
pnorm(sqrt(n[con.large])*theta0-sqrt(n[con.large])*null.delta[con.large]/sqrt(V)-sqrt(n[con.large])*theta/sqrt(V)+qnorm(1-alpha/2))
}else if(null.delta>(qnorm(1-alpha/2)*sqrt(V)/sqrt(n))){
## Scalar null.delta wide enough for the CI to fit inside the null zone.
results =
pnorm(sqrt(n)*theta0+sqrt(n)*null.delta/sqrt(V)-sqrt(n)*theta/sqrt(V)-qnorm(1-alpha/2))-
pnorm(sqrt(n)*theta0-sqrt(n)*null.delta/sqrt(V)-sqrt(n)*theta/sqrt(V)+qnorm(1-alpha/2))
}else{
## Null zone narrower than the CI: the CI can never sit wholly inside it.
results = rep(0, length(theta))
}
results
}
## Probability that the SGPV is inconclusive (0 < p_delta < 1): the complement
## of being fully consistent with the alternative (prob.alt.sg) or fully
## consistent with the null (prob.null.sg).  Arguments mirror those functions.
## Returns a numeric vector the same length as n.
prob.incon.sg = function(n, null.delta,
                         theta0 = 0,
                         theta = 0,
                         V = 1,
                         alpha = 0.05) {
  ## (The original also computed con.large/con.small index vectors here; they
  ## were never used, so they have been removed.)
  1 - (prob.alt.sg(n, null.delta, theta0, theta, V, alpha) +
         prob.null.sg(n, null.delta, theta0, theta, V, alpha))
}
## For a grid of true effects theta, plot the probability of each SGPV outcome
## (consistent with alternative / consistent with null / inconclusive) at
## n = 50, for null half-widths 0, 0.5 and 1.
theta.1 = seq(-3, 3, by = 0.01)
outcome.panels = list(
  list(f = prob.alt.sg,   main = "Prob of being consistent with alternative"),
  list(f = prob.null.sg,  main = "Prob of being consistent with null"),
  list(f = prob.incon.sg, main = "Prob of being inconclusive"))
for (panel in outcome.panels) {
  plot(theta.1, panel$f(n = 50,
                        null.delta = 0,
                        theta = theta.1), type = "l",
       ylim = c(0, 1),
       xlab = "Alternative",
       ylab = "Probability",
       main = panel$main)
  lines(theta.1, panel$f(n = 50,
                         null.delta = 0.5,
                         theta = theta.1),
        col = "green")
  lines(theta.1, panel$f(n = 50,
                         null.delta = 1,
                         theta = theta.1),
        col = "blue")
  abline(h = 0.05, lty = 2)
  legend("bottomright",
         legend = c("Delta=0", "Delta=0.5", "Delta=1"),
         col = c("black", "green", "blue"), lty = 1)
}
## Worked fdrisk() examples from the sgpv package: false discovery risk (FDR,
## sgpval = 0) and false confirmation risk (FCR, sgpval = 1) under different
## interval nulls and weighting schemes.
## NOTE(review): the original began with a bare no-argument call
## `sgpv::fdrisk()`, which errors at run time; it has been removed.
###### FDR rates
# false discovery risk with 95% confidence level
fdrisk(sgpval = 0,
null.lo = log(1/1.1),
null.hi = log(1.1),
std.err = 0.8,
null.weights = 'Uniform',
null.space = c(log(1/1.1), log(1.1)),
alt.weights = 'Uniform',
alt.space = 2 + c(-1,1)*qnorm(1-0.05/2)*0.8,
interval.type = 'confidence',
interval.level = 0.05)
## [1] 0.05949861
## with truncated normal weighting distribution
fdrisk(sgpval = 0,
null.lo = log(1/1.1),
null.hi = log(1.1),
std.err = 0.8,
null.weights = 'Point',
null.space = 0,
alt.weights = 'TruncNormal',
alt.space = 2 + c(-1,1)*qnorm(1-0.041/2)*0.8,
interval.type = 'likelihood', interval.level = 1/8)
## [1] 0.04902575
# false discovery risk with LSI and wider null hypothesis
fdrisk(sgpval = 0,
null.lo = log(1/1.5),
null.hi = log(1.5),
std.err = 0.8,
null.weights = 'Point',
null.space = 0,
alt.weights = 'Uniform',
alt.space = 2.5 + c(-1,1)*qnorm(1-0.041/2)*0.8,
interval.type = 'likelihood',
interval.level = 1/8)
## [1] 0.01688343
# false confirmation risk example
fdrisk(sgpval = 1,
null.lo = log(1/1.5),
null.hi = log(1.5),
std.err = 0.15,
null.weights = 'Uniform',
null.space = 0.01 + c(-1,1)*qnorm(1-0.041/2)*0.15,
alt.weights = 'Uniform',
alt.space = c(log(1.5), 1.25*log(1.5)),
interval.type = 'likelihood',
interval.level = 1/8)
## [1] 0.03059522
###
##
#
False Discovery and Confirmation Rates
## Operating characteristics of the p-value and the SGPV over a grid of true
## means.  Returns a matrix with one row per mu in seq(mu.l, mu.u, 0.01) and
## columns: alpha.d/power.d/fdr.d/fndr.d (indifference-zone null of half-width
## delta), alpha.0/power.0/fdr.0/fndr.0 (point null, delta = 0), and
## gam.0/gam.a (P(sgpv = 1) under the null and under the alternative mu).
power.pv=function(n=20,sd=1,alpha=0.05,mu.l=-3,mu.u=3,delta=0.3,mu.0=0,r=1){
## Computes power when delta is zero and non-zero
## Inputs:
## n is sample size
## indifference zone is: mu.0 +/- delta
## alternative space is: mu in [mu.l, mu.u]
## data ~ N(mu,sd)
## r = P(H1)/P(H0) for simple H1 and H0
mu=seq(mu.l,mu.u,0.01)
## Power (null is mu.0 ; alt is mu)
pow1.d=pnorm(-sqrt(n)*(delta-(mu.0-mu))/sd-qnorm(1-alpha/2))
pow2.d=pnorm(-sqrt(n)*(delta+(mu.0-mu))/sd-qnorm(1-alpha/2))
power.d=pow1.d+pow2.d
## Power without indifference zone (delta=0)
pow1.0=pnorm(-sqrt(n)*(0-(mu.0-mu))/sd-qnorm(1-alpha/2))
pow2.0=pnorm(-sqrt(n)*(0+(mu.0-mu))/sd-qnorm(1-alpha/2))
power.0=pow1.0+pow2.0
## Type I error (null is mu.0 ; alt is mu.0)
alpha.d=2*pnorm(-sqrt(n)*(delta)/sd-qnorm(1-alpha/2))
## Type I error without indifference zone (delta=0)
alpha.0=2*pnorm(-sqrt(n)*(0)/sd-qnorm(1-alpha/2))
## P(sgpv=1|H0) not quite alpha b/c rules out inconclusive
## (indicator is 0 unless the null zone is wider than the CI half-width)
gam1.0=pnorm(sqrt(n)*(delta)/sd-qnorm(1-alpha/2))
gam2.0=pnorm(sqrt(n)*(-delta)/sd+qnorm(1-alpha/2))
gam.0=(gam1.0-gam2.0)*(delta>qnorm(1-alpha/2)*sd/sqrt(n))
## Same quantity evaluated under the alternative mean mu
gam1.a=pnorm(sqrt(n)*(delta+mu.0-mu)/sd-qnorm(1-alpha/2))
gam2.a=pnorm(sqrt(n)*(mu.0-delta-mu)/sd+qnorm(1-alpha/2))
gam.a=(gam1.a-gam2.a)*(delta>qnorm(1-alpha/2)*sd/sqrt(n))
## False discovery rates (fdr1=FDR ; fdr0=FNDR)
fdr.d = 1 / (1 + power.d/alpha.d * r)
fndr.0 = 1 / (1 + (1-alpha.0)/(1-power.0) * (1/r))
fdr.0 = 1 / (1 + power.0/alpha.0 * r)
fndr.d = 1 / (1 + gam.0/gam.a * (1/r))
## Assemble the results matrix (one row per mu on the grid).
ret=cbind(mu,alpha.d,power.d,fdr.d,fndr.d,
alpha.0,power.0,fdr.0,fndr.0,gam.0,gam.a)
colnames(ret)=c('mu','alpha.d','power.d','fdr.d','fndr.d',
'alpha.0','power.0','fdr.0','fndr.0',
'gam.0','gam.a')
ret
}
## Plot FDR / FCR curves from power.pv() for a point null (dashed lines) and an
## indifference-zone null of half-width set.delta (solid lines).
par(mfrow=c(1,1))
prev.ratio=1    # r = P(H1)/P(H0)
set.delta=0.5   # indifference-zone half-width
set.n=16        # sample size
all=power.pv(delta=set.delta,n=set.n,r=prev.ratio,alpha=0.05)
plot(all[,'mu'],all[,'fdr.0'],type="n",ylab="Probability",xlab="Alternative in SD units",
ylim=c(0,prev.ratio/(prev.ratio+1)),xlim=c(-3,3))
title(main="False discovery and confirmation rates")
## Dashed: point null (delta = 0); solid: indifference-zone null (delta > 0).
lines(all[,'mu'],all[,'fdr.0'],col="firebrick",lwd=1,lty=2)
lines(all[,'mu'],all[,'fndr.0'],col="dodgerblue",lwd=1,lty=2)
lines(all[,'mu'],all[,'fdr.d'],col="firebrick",lwd=2)
lines(all[,'mu'],all[,'fndr.d'],col="dodgerblue",lwd=2)
legend("topleft",c("",
expression(paste("P-value < ",0.05,sep="")),
expression(paste(P[delta]," = 0",sep="")),
"", "",
expression(paste("P-value > ",0.05,sep="")),
expression(paste(P[delta]," = 1",sep=""))),
col=c("","firebrick","firebrick","","",
"dodgerblue","dodgerblue"),
lty=c(NA,2,1,NA,NA,2,1),lwd=c(NA,1,2,NA,NA,1,2),bty="n")
text(-2.875,0.5,"FDR")
text(-2.875,0.42,"FCR")
## Mark the indifference-zone boundaries on the bottom and top axes.
axis(side = 1, at = c(-set.delta,set.delta),cex.axis=0.8,col.ticks="black",lwd.ticks=1.5,tcl=0.6,labels=FALSE)
axis(side = 3, at = c(-set.delta,set.delta),cex.axis=0.8,col.ticks="black",lwd.ticks=1.5,tcl=0.6,labels=FALSE)
text(0,-0.0125,expression(paste(H[0]," Zone",sep="")),cex=0.8)
text(0,prev.ratio/(prev.ratio+1)*1.015,expression(paste(H[0]," Zone",sep="")),cex=0.8)
text(2.5,0.05/(prev.ratio+0.05)+.01,"alpha/(alpha+r)")
Purpose: Evaluate how different techniques of setting the indifference zone affect the SGPV study properties.
What options does a collaborator have when they are uncertain of the indifference zone?
## Candidate half-widths delta(n) for the indifference zone -- all constant in
## n -- plus the half-width of a (1 - alpha) CI, which shrinks like 1/sqrt(n).
## NOTE: TeX() comes from the latex2exp package.
n = seq(1, 250, by = 0.1)
V = 1
alpha = 0.05
null.delta0 = rep(0.5, length(n))
null.delta1 = rep(0, length(n))
null.delta2 = rep(1.25, length(n))
null.delta3 = rep(1, length(n))
null.delta4 = rep(0.75, length(n))
null.delta5 = rep(0.25, length(n))
ci.bound = qnorm(1 - alpha / 2) * sqrt(V) / sqrt(n)
plot(n, null.delta1,
     type = "l", col = "red", lwd = 2, ylim = c(0, 3),
     ylab = TeX(r'(Half Interval Width $ (\delta)$)'),
     xlab = "Sample Size (n)",
     main = "")
## Overlay the remaining constant half-widths, then the shrinking CI bound.
width.overlay = list(list(y = null.delta2, col = "blue"),
                     list(y = null.delta3, col = "purple"),
                     list(y = null.delta4, col = "darkgreen"),
                     list(y = null.delta5, col = "orange"),
                     list(y = null.delta0, col = "hotpink"))
for (o in width.overlay) {
  lines(n, o$y, col = o$col, lwd = 2)
}
lines(n, ci.bound, col = "black", lty = 2, lwd = 2)
# abline(h=0.05, col="black", lty=2)
legend(130, 3,
       col = c("black", "red", "blue", "purple",
               "darkgreen", "hotpink", "orange"),
       box.lty = 0,
       lty = c(2, 1, 1, 1, 1, 1, 1),
       cex = 0.65,
       lwd = 2,
       legend = c(TeX(r'(Uncertainty Interval $\left(\sqrt{1/n}\right)$)'),
                  "Point Null: Constant at 0",
                  "SGPV: Constant at 1.25",
                  "SGPV: Constant at 1",
                  "SGPV: Constant at 0.75",
                  "SGPV: Constant at 0.5",
                  "SGPV: Constant at 0.25"))
## Left panel: P(consistent with alternative) at theta = 0 (type 1 error).
## Right panel: the same at theta = 1 (power).  One curve per candidate delta.
par(mfrow = c(1, 2))
par(mar = c(2, 2, 2, 2))
alt.deltas = list(null.delta2, null.delta3, null.delta4, null.delta5, null.delta0)
alt.cols = c("blue", "purple", "darkgreen", "orange", "hotpink")
plot(n, prob.alt.sg(n, null.delta1, theta = 0),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 0.055), ylab = "",
     main = TeX(r'(Type 1 Error $\beta_0$)'))
for (j in seq_along(alt.deltas)) {
  lines(n, prob.alt.sg(n, alt.deltas[[j]], theta = 0), col = alt.cols[j], lwd = 2)
}
abline(h = 0.05, col = "black", lty = 2)
plot(n, prob.alt.sg(n, null.delta1, theta = 1),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), ylab = "",
     main = TeX(r'(Power $\beta_1$)'))
for (j in seq_along(alt.deltas)) {
  lines(n, prob.alt.sg(n, alt.deltas[[j]], theta = 1), col = alt.cols[j], lwd = 2)
}
legend(70, 0.5,
       col = c("red", "blue", "purple", "darkgreen", "hotpink", "orange"),
       box.lty = 0,
       lty = rep(1, 6),
       cex = 0.7,
       lwd = 2,
       legend = c("Point Null: Constant at 0",
                  "SGPV: Constant at 1.25",
                  "SGPV: Constant at 1",
                  "SGPV: Constant at 0.75",
                  "SGPV: Constant at 0.5",
                  "SGPV: Constant at 0.25"))
## Sum of power (beta_1, evaluated at theta = 1) and probability of the null
## (omega_0, evaluated at theta = 0) for each candidate delta.
## NOTE(review): the null.delta3 (purple) curve was disabled in the original;
## its legend colour entry is retained for consistency with the other figures.
par(mfrow = c(1, 1))
par(mar = c(4, 4, 2, 2))
## Helper: beta_1 + omega_0 as a function of the null half-width vector.
power.plus.null = function(d) {
  prob.null.sg(n, d, theta = 0) + prob.alt.sg(n, d, theta = 1)
}
plot(n, power.plus.null(null.delta1),
     type = "l", col = "red", lwd = 2, ylim = c(0, 2),
     ylab = TeX(r'( $\beta_1$ + $\omega_0$)'),
     main = TeX(r'(Sum of Power and Probability of Null ($\beta_1$ +$\omega_0$))'))
lines(n, power.plus.null(null.delta2), col = "blue", lwd = 2)
## lines(n, power.plus.null(null.delta3), col = "purple", lwd = 2)  # disabled upstream
lines(n, power.plus.null(null.delta4), col = "darkgreen", lwd = 2)
lines(n, power.plus.null(null.delta5), col = "orange", lwd = 2)
lines(n, power.plus.null(null.delta0), col = "hotpink", lwd = 2)
legend(172, 0.7,
       col = c("red", "blue", "purple", "darkgreen", "hotpink", "orange"),
       box.lty = 0,
       ## The original passed 12 lty values for 6 entries; only the first 6
       ## are used, so the dead values have been dropped.
       lty = rep(1, 6),
       cex = 0.7,
       lwd = 2,
       legend = c(TeX(r'( Constant at 0)'),
                  TeX(r'(Constant at 1.25)'),
                  TeX(r'( Constant at 1)'),
                  TeX(r'( Constant at 0.75)'),
                  TeX(r'(Constant at 0.5)'),
                  TeX(r'( Constant at 0.25)')))
## Probability of being consistent with the null under theta = 0 (left panel,
## omega_0) and theta = 1 (right panel, omega_1), one curve per delta.
par(mfrow = c(1, 2))
par(mar = c(2, 2, 4, 2))
nul.deltas = list(null.delta2, null.delta3, null.delta4, null.delta5, null.delta0)
nul.cols = c("blue", "purple", "darkgreen", "orange", "hotpink")
plot(n, prob.null.sg(n, null.delta1, theta = 0),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), ylab = "",
     main = TeX(r'(Probability of Null $\omega_0$)'))
for (j in seq_along(nul.deltas)) {
  lines(n, prob.null.sg(n, nul.deltas[[j]], theta = 0), col = nul.cols[j], lwd = 2)
}
plot(n, prob.null.sg(n, null.delta1, theta = 1),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), ylab = "",
     main = TeX(r'(Probability of Null $\omega_1$)'))
for (j in seq_along(nul.deltas)) {
  lines(n, prob.null.sg(n, nul.deltas[[j]], theta = 1), col = nul.cols[j], lwd = 2)
}
legend(70, 0.5,
       col = c("red", "blue", "purple", "darkgreen", "hotpink", "orange"),
       box.lty = 0,
       lty = rep(1, 6),
       cex = 0.7,
       lwd = 2,
       legend = c("Point Null: Constant at 0",
                  "SGPV: Constant at 1.25",
                  "SGPV: Constant at 1",
                  "SGPV: Constant at 0.75",
                  "SGPV: Constant at 0.5",
                  "SGPV: Constant at 0.25"))
## Probability of an inconclusive SGPV outcome under theta = 0 (left panel,
## gamma_0) and theta = 1 (right panel, gamma_1), one curve per delta.
par(mfrow = c(1, 2))
par(mar = c(2, 2, 4, 2))
inc.deltas = list(null.delta2, null.delta3, null.delta4, null.delta5, null.delta0)
inc.cols = c("blue", "purple", "darkgreen", "orange", "hotpink")
plot(n, prob.incon.sg(n, null.delta1, theta = 0),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), ylab = "",
     main = TeX(r'(Probability of Inconclusive $\gamma_0$)'))
for (j in seq_along(inc.deltas)) {
  lines(n, prob.incon.sg(n, inc.deltas[[j]], theta = 0), col = inc.cols[j], lwd = 2)
}
plot(n, prob.incon.sg(n, null.delta1, theta = 1),
     type = "l", col = "red", lwd = 2,
     ylim = c(0, 1), ylab = "",
     main = TeX(r'(Probability of Inconclusive $\gamma_1$)'))
for (j in seq_along(inc.deltas)) {
  lines(n, prob.incon.sg(n, inc.deltas[[j]], theta = 1), col = inc.cols[j], lwd = 2)
}
legend(90, 0.8,
       col = c("red", "blue", "purple", "darkgreen", "hotpink", "orange"),
       box.lty = 0,
       lty = rep(1, 6),
       cex = 0.7,
       lwd = 2,
       legend = c("Point Null: Constant at 0",
                  "SGPV: Constant at 1.25",
                  "SGPV: Constant at 1",
                  "SGPV: Constant at 0.75",
                  "SGPV: Constant at 0.5",
                  "SGPV: Constant at 0.25"))
Let’s see how these look on a plot:
## Visualise candidate indifference-zone boundaries against the null
## (theta_0 = 0) and alternative (theta_1 = 1) hypotheses.
## NOTE(review): the blue/hotpink/orange curves drawn here are the constant
## null.delta vectors, while the legend labels them as decaying in n --
## confirm which version is intended.
n2 = seq(-10, 300, by = 0.1)
## Half-width of the "original SGPV" zone.  `k` was used below without ever
## being defined; 0.9 matches the axis ticks at +/-0.9 and +/-0.45 (= k/2)
## and the legend entry "Original SGPV: Constant at 0.9".
k = 0.9
plot(n, ci.bound,
     type = "n", col = "black", lty = 2, lwd = 2,
     ylim = c(-2, 2.2), xlim = c(0, 225),
     ylab = "", xlab = "Sample Size (n)",
     main = "", yaxt = "n")
# lines(n, -1*ci.bound, col="black", lty=2, lwd=2)
## Symmetric +/- delta boundaries for three candidate zones.
lines(n, null.delta2, col = "blue", lwd = 2)
lines(n, -1 * null.delta2, col = "blue", lwd = 2)
lines(n, null.delta4, col = "hotpink", lwd = 2)
lines(n, -1 * null.delta4, col = "hotpink", lwd = 2)
lines(n, null.delta5, col = "orange", lwd = 2)
lines(n, -1 * null.delta5, col = "orange", lwd = 2)
abline(h = k, col = "black", lty = 1, lwd = 2)
abline(h = -1 * k, col = "black", lty = 1, lwd = 2)
abline(h = 0, col = "red", lty = 1, lwd = 2)      # null hypothesis theta_0
abline(h = 1, col = "red", lty = 2, lwd = 2)      # alternative theta_1
## Shade the bands between k/2 and k on both sides of zero.
polygon(c(n2, rev(n2)), c(rep(-1 * k, length(n2)), rev(rep(-1 * (k / 2), length(n2)))),
        border = NA,
        col = rgb(210/633, 210/633, 210/633, alpha = 0.2))
polygon(c(n2, rev(n2)), c(rep((k / 2), length(n2)), rev(rep(k, length(n2)))),
        border = NA,
        col = rgb(210/633, 210/633, 210/633, alpha = 0.2))
axis(2, at = c(-2, -1, -0.9, -0.45, 0,
               2, 1, 0.9, 0.45),
     labels = c(-2, -1, -0.9, -0.45, TeX(r'($\theta_0=0$)'),
                2, TeX(r'($\theta_1=1$)'), 0.9, 0.45),
     las = 1)
legend(135, 2.25,
       col = c("red", "red", "black", "blue", "hotpink", "orange"),
       box.lty = 0,
       lty = c(1, 2, 1, 1, 1, 1),
       cex = 0.9,
       lwd = 2,
       ## Fixed: the original TeX strings used `$delta_...$` (missing the
       ## backslash), which renders literally instead of as a Greek delta.
       legend = c(TeX(r'(Null Hypothesis $\theta_0=0$)'),
                  TeX(r'(Alternative Hypothesis $\theta_1=1$)'),
                  "Original SGPV: Constant at 0.9",
                  TeX(r'($\delta_1(n)\propto n^{-1/4}$)'),
                  TeX(r'($\delta_2(n)\propto n^{-1/3}$)'),
                  TeX(r'($\delta_3(n)\propto n^{-1/2}$)')))
## Single worked example (n = 6): compare the TOST equivalence test from
## TOSTER::tsum_TOST() with the SGPV from sgpv::sgpvalue() on one N(0,1)
## sample, using raw equivalence bounds of +/- 0.375.
n=6
dat = rnorm(n, 0, 1)
## 95% t-interval limits for the mean
a = t.test(dat)$conf.int[1]
b = t.test(dat)$conf.int[2]
x_bar = mean(dat)
S = sd(dat)
## NOTE(review): Z is computed but never used below -- confirm it can be dropped.
Z = (b-x_bar)/(S/sqrt(n))
## Equivalence bounds: +/- half of 0.75
theta_p = 0.5*0.75
theta_m = -0.5*0.75
## TOST equivalence test from summary statistics, raw (unstandardised) bounds
out_TOST = tsum_TOST(m1=x_bar,
mu=0,
sd1=S,
n1=n,
low_eqbound=theta_m,
high_eqbound=theta_p,
alpha=0.05,
eqbound_type = "raw")
out_TOST$TOST
## t SE df p.value
## t-test 0.1645490 0.4629848 5 0.8757443
## TOST Lower 0.9745108 0.4629848 5 0.1872798
## TOST Upper -0.6454127 0.4629848 5 0.2735524
## TOST decision p-value: the larger of the two one-sided p-values
max(out_TOST$TOST$p.value[2], out_TOST$TOST$p.value[3])
## [1] 0.2735524
## SGPV computed from the same CI and the same null interval
out_sgpv = sgpvalue(est.lo=a,
est.hi=b,
null.lo=theta_m,
null.hi=theta_p)
out_sgpv
## $p.delta
## [1] 0.5
##
## $delta.gap
## [1] NA
## Monte-Carlo comparison of the SGPV and the TOST equivalence p-value on
## repeated N(0,1) samples of size n.  For each draw we record the SGPV, the
## larger of the two one-sided TOST p-values, and which overlap "case" the
## 95% CI falls into relative to the equivalence bounds.
## (Removed from the original: unused half.case/half.case2 accumulators and
## the unused Z statistic; the loop-invariant bounds are hoisted out.)
set.seed(999)
iter = 500
n = 6
results = as.data.frame(matrix(ncol = 3, nrow = iter))
colnames(results) = c("sgpv", "tost", "Case")
## Equivalence bounds: +/- half of 0.75 (loop-invariant)
theta_p = 0.5 * 0.75
theta_m = -0.5 * 0.75
for (i in 1:iter) {
  dat = rnorm(n, 0, 1)
  a = t.test(dat)$conf.int[1]   # lower 95% CI limit
  b = t.test(dat)$conf.int[2]   # upper 95% CI limit
  x_bar = mean(dat)
  S = sd(dat)
  out_TOST = tsum_TOST(m1 = x_bar,
                       mu = 0,
                       sd1 = S,
                       n1 = n,
                       low_eqbound = theta_m,
                       high_eqbound = theta_p,
                       alpha = 0.05,
                       eqbound_type = "raw")
  out_sgpv = sgpvalue(est.lo = a,
                      est.hi = b,
                      null.lo = theta_m,
                      null.hi = theta_p)$p.delta
  results[i, 1] = out_sgpv
  ## TOST decision p-value: the larger of the two one-sided tests.
  results[i, 2] = pmax(out_TOST$TOST$p.value[2], out_TOST$TOST$p.value[3])
  ## Classify how the CI (a, b) overlaps the null zone (theta_m, theta_p).
  if (theta_m < a & theta_p > a & theta_p < b) {
    # Case 4 some overlap
    results[i, 3] = "\n Case 4: \nPartial Overlap"
  } else if (theta_m < b & theta_p > b & theta_m > a) {
    # Case 3 some overlap
    results[i, 3] = "\n Case 3: \nPartial Overlap"
  } else if (theta_m > b | theta_p < a) {
    # Case 1 no overlap
    results[i, 3] = "\n Case 1: \nNo Overlap"
  } else if ((theta_m > a & theta_p < b) | (theta_m <= a & theta_p >= b)) {
    # Case 2 complete overlap
    results[i, 3] = "\n Case 2: \nComplete Overlap"
  }
}
library(ggplot2)
## Format axis tick labels with one decimal place.
scaleFUN <- function(x) sprintf("%.1f", x)
## Scatter of SGPV vs TOST equivalence p-value.  Secondary axes label the
## interpretation regions; the dashed line is sgpv + tost = 1; lettered
## annotations mark reference points discussed in the text.
## NOTE(review): `guides(alpha=FALSE)` and `element_line(size=)` are deprecated
## in current ggplot2 (use `guides(alpha="none")` and `linewidth=`); left
## unchanged here.
g = ggplot(results, aes(sgpv, tost))+
xlab("SGPVs") +
ylab("Equivalence p-values") +
geom_point(aes(alpha=0.2)) +
geom_abline(intercept = 1, slope = -1,linetype='dashed', alpha=0.6)+
xlim(c(0,1))+
scale_x_continuous(breaks= c( 0,0.120001,0.880001,1),labels=c("0", "~0.1", "~0.9", "1"), minor_breaks = c(0.12,0.88),
sec.axis=sec_axis(~./1, name="",breaks= c( 0.04,0.51,0.98), labels=c("Consistent \n with Alternative", "\n Inconclusive", "Consistent \n with Null")))+
scale_y_continuous(breaks= c(0,0.07,1), labels=c("0", expression(alpha), "1"),minor_breaks = c(0.070001),
sec.axis=sec_axis(~./1, name="", breaks= c(0.005,0.61), labels=c("Consistent \n with Null","Inconclusive")))+
annotate(geom="text", x=0, y=0.9, label="D",
size=5,
family = "Courier-Bold")+
annotate(geom="text", x=0.85, y=0.98, label="E",
size=5,
family = "Courier-Bold")+
annotate(geom="text", x=0.91, y=0.98, label="F",
size=5,
family = "Courier-Bold")+
annotate(geom="text", x=0, y=0, label="H",
size=5,
family = "Courier-Bold")+
annotate(geom="text", x=0.15, y=0, label="I",
size=5,
family = "Courier-Bold")+
annotate(geom="text", x=0.91, y=0, label="J",
size=5,
family = "Courier-Bold")+
theme_bw()+
theme(axis.text.y = element_text(size=10, hjust=-0.5),
axis.text.y.right = element_text(size=10, hjust=0.7),
axis.text.x.top = element_text(size=10, vjust=2),
axis.text.x = element_text(size=10, vjust=-0.5),
panel.grid.major = element_blank(),
panel.grid.minor = element_line(size = 0.8,colour="darkgrey"))+
guides(alpha=FALSE)
g
## Same SGPV-vs-TOST scatter, now coloured and shaped by the CI-overlap case
## assigned in the simulation loop above.
ggplot(results, aes(sgpv, tost, col=Case, pch=Case))+
geom_point(aes(), cex=1.5) +
# geom_jitter(width = 0.02, height = 0.02)+
# ggtitle(paste0("Derived Sample Size= ",n," Iterations= ", iter))+
xlab("SGPV") +
ylab("TOST Eq PV") +
# facet_wrap(~ Case) +
## Dashed reference line: sgpv + tost = 1.
geom_abline(intercept = 1, slope = -1,linetype='dashed', alpha=0.6)+
scale_color_manual(values=c("#0000B9",
"#00A300",
"#FD49A0",
"#FFA500",
"black"))+
scale_shape_manual(values=c(16,21,17,24,21))+
guides(alpha=FALSE)+
scale_y_continuous(labels=scaleFUN)+
scale_x_continuous(labels=scaleFUN)+
guides(color = guide_legend(byrow = TRUE))+
theme_bw()+
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.spacing.y = unit(0.3, 'cm'))
We first present an example by applying the ProSGPV algorithm to a simulated dataset by use of gen.sim.data function.
With sample size n = 50, number of variables p = 10, number of true signals s = 4, smallest effect size \(\beta_{min}\) = 1, largest effect size \(\beta_{max}\) = 5, autoregressive correlation ρ = 0.2 and variance σ² = 1, signal-to-noise ratio (SNR) \(\nu\) = 2.
We generate outcomes Y following Gaussian distribution.
gen.sim.data outputs X, Y, indices of true signals, and a
vector of true coefficients.
## Simulate data with ProSGPV's gen.sim.data(): n = 50 observations, p = 10
## variables, s = 4 true signals, AR(1) correlation rho = 0.2, SNR nu = 2.
set.seed(1)
sim.data <- gen.sim.data(n = 50, p = 10, s = 4,
family = "gaussian",
beta.min = 1, beta.max = 5,
rho = 0.2, nu = 2)
x <- sim.data[[1]]   # design matrix
y <- sim.data[[2]]   # Gaussian outcome
(true.index <- sim.data[[3]])   # indices of the true signals
## [1] 2 4 5 7
true.beta <- sim.data[[4]]   # true coefficient vector
pro.sgpv function takes inputs of explanatory variables
x, outcome y, outcome type family (default is "gaussian"), stage
indicator (default is 2), and a GVIF indicator (default is FALSE).
sgpv.out.2 <- pro.sgpv(x,y)
sgpv.out.2
## Selected variables are V2 V5 V7
# true.beta
true.index
## [1] 2 4 5 7
plot(sgpv.out.2)
Model Summary
summary(sgpv.out.2)
##
## Call:
## lm(formula = Response ~ ., data = data.d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.0068 -2.3711 0.6028 2.7228 10.3547
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.08957 0.71297 -0.126 0.900574
## V2 -5.44915 0.64621 -8.433 6.87e-11 ***
## V5 3.26839 0.80015 4.085 0.000175 ***
## V7 -3.68785 0.77444 -4.762 1.94e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.895 on 46 degrees of freedom
## Multiple R-squared: 0.7034, Adjusted R-squared: 0.684
## F-statistic: 36.36 on 3 and 46 DF, p-value: 3.382e-12
coef function can be used to extract the coefficient
estimates of length p. When signals are sparse, some of the estimates are
zero. A comparison shows that the estimates are close to the true effect
sizes in the simulation.
beta.hat = coef(sgpv.out.2)
rbind(beta.hat, true.beta)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## beta.hat 0 -5.449154 0 0 3.268389 0 -3.687846 0 0 0
## true.beta 0 -5.000000 0 1 2.333333 0 -3.666667 0 0 0
predict function can be used to predict outcomes using
the selected model: predict(sgpv.out.2). Out-of-sample prediction can be
made by feeding a new data set into the newdata argument.
preds = predict(sgpv.out.2)
cbind(preds, y)
## preds y
## 1 8.2993645 14.9532559
## 2 -8.8659786 -14.1402243
## 3 8.1576658 5.6770472
## 4 16.7141593 21.2556549
## 5 -6.8915154 -11.9104536
## 6 6.0995442 -6.9072594
## 7 -3.3938972 -6.4016210
## 8 2.6410697 -2.2790190
## 9 -7.8186176 -6.5374117
## 10 4.8723877 3.9731037
## 11 -10.7865487 -9.6668518
## 12 0.6084523 -1.4338933
## 13 3.8841387 5.2537291
## 14 -3.8550714 -2.9899402
## 15 4.7869614 9.6090104
## 16 -1.1651049 -7.4962893
## 17 5.3357619 4.8054879
## 18 -2.2207106 -6.9379824
## 19 -3.1640880 0.2746154
## 20 5.2684414 5.8258520
## 21 9.4316832 -0.1262995
## 22 -15.5982310 -14.6354156
## 23 -9.0735701 -3.7066464
## 24 -14.3862762 -8.9421481
## 25 -7.7604018 -7.1121788
## 26 2.3951003 -2.7577937
## 27 -2.4709820 1.3670330
## 28 3.6316127 3.6091253
## 29 -2.6335952 -3.7277478
## 30 -6.4989520 2.3933283
## 31 -10.2122164 -11.4924695
## 32 -3.5543368 -2.5775804
## 33 5.7377149 10.7925290
## 34 3.2970610 6.2475028
## 35 10.0276588 8.8856192
## 36 -10.2391253 -11.7966930
## 37 4.3474561 6.5661946
## 38 -0.8008003 1.0013047
## 39 -4.0768054 -4.7068644
## 40 0.5462697 10.9009497
## 41 -3.4339914 -2.7473043
## 42 -2.6547422 -6.6247673
## 43 7.5189727 9.6165887
## 44 -5.4459692 -14.7809394
## 45 -12.0955843 -9.2048255
## 46 4.2851927 5.2962132
## 47 -15.1612720 -16.5177728
## 48 -2.5288758 -8.1071853
## 49 -8.7118174 -9.0223750
## 50 5.1952891 14.5666888
In addition to the two-stage algorithm, one-stage algorithm can also be used to select variables when n > p. The computation time is shorter for the one-stage algorithm at the expense of slightly reduced support recovery rate in the limit.
sgpv.out.1 <- pro.sgpv(x,y,stage=1)
sgpv.out.1
## Selected variables are V2 V5 V7
The one-stage algorithm missed V4 and only selected three variables. What happened?
plot(sgpv.out.1)
We can load the Tehran Housing data t.housing stored in
the package.
A dataset containing Tehran housing data. The data set has 372 observations. There are 26 explanatory variables at baseline, including 7 project physical and financial features (V2-V8) and 19 economic variables and indices (V11-V29). The outcome (V9) is the sales price of a real estate single-family residential apartment.
set.seed(999)
# prepare the data
x = t.housing[,-ncol(t.housing)]
y = t.housing$V9
# run ProSGPV
out.sgpv.2 <- pro.sgpv(x = x, y = y)
The two-stage algorithm selects the following variables.
out.sgpv.2
## Selected variables are V8 V12 V13 V15 V17 V26
We can view the summary of the final model.
summary(out.sgpv.2)
##
## Call:
## lm(formula = Response ~ ., data = data.d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1276.35 -75.59 -9.58 59.46 1426.22
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.708e+02 3.471e+01 4.920 1.31e-06 ***
## V8 1.211e+00 1.326e-02 91.277 < 2e-16 ***
## V12 -2.737e+01 2.470e+00 -11.079 < 2e-16 ***
## V13 2.185e+01 2.105e+00 10.381 < 2e-16 ***
## V15 2.041e-03 1.484e-04 13.756 < 2e-16 ***
## V17 -3.459e+00 8.795e-01 -3.934 0.00010 ***
## V26 -4.683e+00 1.780e+00 -2.630 0.00889 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194.8 on 365 degrees of freedom
## Multiple R-squared: 0.9743, Adjusted R-squared: 0.9739
## F-statistic: 2310 on 6 and 365 DF, p-value: < 2.2e-16
Coefficient estimates can be extracted by use of coef.
coef(out.sgpv.2)
## [1] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [6] 0.000000000 1.210755031 0.000000000 -27.367601037 21.853920174
## [11] 0.000000000 0.002040784 0.000000000 -3.459496972 0.000000000
## [16] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [21] 0.000000000 0.000000000 -4.683172725 0.000000000 0.000000000
## [26] 0.000000000
In-sample prediction can be made using S3 method predict and an external sample can be provided to make out-of-sample prediction with an argument of newx in the predict function.
head(predict(out.sgpv.2))
## 1 2 3 4 5 6
## 1565.7505 3573.7793 741.7576 212.1297 5966.1682 5724.0172
S3 method plot can be used to visualize the variable selection process.
First, we plot the full solution path with point estimates and 95%
confidence intervals. Note that the null region is in grey. The selected
variables are colored blue on the y-axis. lambda.max
controls the limit of the X-axis.
plot(out.sgpv.2, lambda.max = 0.01)
Alternatively, we can plot the confidence bound that is closer to the null.
plot(out.sgpv.2,lpv=1,lambda.max=0.01)